This notebooks explores the results from running cell type annotation with SingleR using the NBAtlas. The NBAtlas reference was aggregated with SingelR prior to model training.

In this notebook, we visualize inferred cell type annotations directly, compare them to normal consensus cell types, and validate cell type assignments with marker genes.

Setup

options(readr.show_col_types = FALSE)
suppressPackageStartupMessages({
  library(ggplot2)
  library(patchwork)
  library(SingleCellExperiment)
})

theme_set(theme_bw())

# Define color ramp for shared use in the heatmaps
heatmap_col_fun <- circlize::colorRamp2(c(0, 1), colors = c("white", "darkslateblue"))
# Set heatmap padding option
ComplexHeatmap::ht_opt(TITLE_PADDING = grid::unit(0.6, "in"))

Paths

# The base path for the OpenScPCA repository, found by its (hidden) .git directory
repository_base <- rprojroot::find_root(rprojroot::is_git_root)

module_dir <- file.path(repository_base, "analyses", "cell-type-neuroblastoma-04")
ref_dir <- file.path(module_dir, "references")
results_dir <- file.path(module_dir, "results", "singler")
data_dir <- file.path(repository_base, "data", "current", "SCPCP000004")
# SingleR files
singler_files <- list.files(
  path = results_dir,
  pattern = "_singler-annotations\\.tsv",
  recursive = TRUE,
  full.names = TRUE
) |>
  # add names as library id
  purrr::set_names(
    \(x) {
      stringr::str_remove(basename(x), "_singler-annotations.tsv")
    }
  )

# merged SCE file
# OBTAINED FROM PORTAL DIRECTLY on 2025-07-09
sce_file <- file.path(
  data_dir,
  "SCPCP000004_merged.rds"
)

# broad consensus cell type groups
validation_url <- "https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/refs/heads/main/analyses/cell-type-consensus/references/consensus-validation-groups.tsv"

# palette files
recoded_palette_file <- file.path(
  module_dir,
  "palettes",
  "harmonized-cell-type-palette.tsv"
)

nbatlas_palette_file <- file.path(
  module_dir,
  "palettes",
  "nbatlas-marker-genes-palette.tsv"
)

# marker genes for validation
consensus_markers_file <- "https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/refs/heads/main/analyses/cell-type-consensus/references/validation-markers.tsv"
nbatlas_markers_file <- file.path(ref_dir, "nbatlas-marker-genes.tsv")

Functions

# Source Jaccard and heatmap utilities functions
source(file.path(module_dir, "scripts", "utils", "jaccard-utils.R"))

# Source additional utilities functions:
# - harmonize_celltypes()
# - faceted_umap()
# - generate_dotplot()
source(file.path(module_dir, "scripts", "utils", "celltype-utils.R"))

Prepare input data

Read SCE object to get consensus cell types and UMAP coordinates:

merged_sce <- readRDS(sce_file)
# immediately remove assays we don't need for space
assay(merged_sce, "spliced") <- NULL
assay(merged_sce, "counts") <- NULL

# RUH ROH!!!
# https://github.com/AlexsLemonade/scpcaTools/issues/298
merged_sce$cell_id <- colnames(merged_sce)

Read cell type data frames:

singler_df <- singler_files |>
  purrr::map(readr::read_tsv) |>
  purrr::list_rbind(names_to = "library_id") |>
  dplyr::select(-delta.next, -labels) |>
  dplyr::mutate(
    # add cell id column for unique rows & joining
    cell_id = glue::glue("{library_id}-{barcodes}"),
    # recode NA -> "unknown_singler"
    pruned.labels = ifelse(
      is.na(pruned.labels),
      "unknown_singler",
      pruned.labels
    )
  )

# validation data frames
validation_df <- readr::read_tsv(validation_url) |>
  dplyr::select(consensus_annotation, validation_group_annotation)
consensus_markers_df <- readr::read_tsv(consensus_markers_file)
nbatlas_markers_df <- readr::read_tsv(nbatlas_markers_file)


# set up palettes

# recoded to shared colors
recoded_palette_df <- readr::read_tsv(recoded_palette_file)
recoded_celltype_pal <- recoded_palette_df$hex_color
names(recoded_celltype_pal) <- recoded_palette_df$harmonized_label

# only the NBAtlas labels - use for validation dotplot
nbatlas_palette_df <- readr::read_tsv(nbatlas_palette_file)
nbatlas_celltype_pal <- nbatlas_palette_df$hex_color
names(nbatlas_celltype_pal) <- nbatlas_palette_df$NBAtlas_label

Join and prepare data for use:

celltype_df <- scuttle::makePerCellDF(
  merged_sce,
  use.coldata = c("barcodes", "sample_id", "library_id", "consensus_celltype_annotation"),
  use.dimred = c("UMAP")
) |>
  # there are repeated barcodes so we need to keep cell_id around
  tibble::rownames_to_column("cell_id") |>
  dplyr::rename(
    UMAP1 = UMAP.1,
    UMAP2 = UMAP.2,
    consensus_annotation = consensus_celltype_annotation
  ) |>
  dplyr::left_join(validation_df, by = "consensus_annotation") |>
  # recode NAs to "unknown_consensus" and remove full consensus labels
  dplyr::mutate(validation_group_annotation = ifelse(
    is.na(validation_group_annotation),
    "unknown_consensus",
    validation_group_annotation
  )) |>
  dplyr::select(-consensus_annotation) |>
  dplyr::left_join(singler_df, by = c("cell_id", "barcodes", "library_id"))

# Recode NBAtlas cell types where possible so they match with colors
celltype_recoded_df <- celltype_df |>
  # rename to make annotation sources more clear
  dplyr::rename(
    "consensus_validation" = validation_group_annotation,
    "singler" = pruned.labels
  ) |>
  # pivot longer for wrangling
  tidyr::pivot_longer(
    c(consensus_validation, singler),
    names_to = "annotation_type",
    values_to = "label"
  ) |> 
  harmonize_celltypes(label, label_recoded) |>
  dplyr::select(-label)

Cell types in NBAtlas

Throughout this notebook, we compare the cell type labels from SingleR with the consensus cell type labels. To facilitate this, the resulting SingleR labels from NBAtlas were grouped together to both a) mirror (where possible) the broad consensus labels, and b) to make some visualizations more manageable.

The table below shows the NBAtlas cell types and how they are represented in visualizations, unless otherwise stated.

Broad label shown in figures Associated NBAtlas labels
T cell CD4+ T cell, CD8+ T cell, Treg, NKT cell
dendritic cell cDC1, cDC2/DC3, Migratory cDC
natural killer cell cell Circulating NK cell, TOX2+/KIT+ NK cell, Resident NK cell
monocyte CD4+ T cell, Classical monocyte, Patrolling monocyte
myeloid Neutrophil
Neuroendocrine Neuroendocrine
fibroblast Fibroblast
macrophage Macrophage
B cell B cell
plasma cell Plasma
endothelial Endothelial
Stromal other (NBAtlas) Stromal other
Schwann Schwann
RBCs RBCs
Immune cycling Immune cycling
pDC pDC

Cells labeled singler_unknown are those for which SingleR could not reliably assign an annotation.

Heatmap

This section compares SingleR annotations to consensus cell type annotations using a heatmap. The heatmap is colored by Jaccard similarity index.

# Pivot wider for heatmap functions
celltype_recoded_wide_df <- celltype_recoded_df |>
  tidyr::pivot_wider(
    names_from = annotation_type,
    values_from = label_recoded
  )

heatmap_col_fun <- circlize::colorRamp2(c(0, 1), colors = c("white", "darkslateblue"))

make_jaccard_heatmap(
  celltype_recoded_wide_df,
  "consensus_validation",
  "singler",
  "Consensus validation label",
  "SingleR NBAtlas label"
)

We broadly see high compatibility between SingleR and consensus labels. In addition, SingleR mostly classifies the unknown consensus cells as Neuroendocrine, which is consistent with our expectation that these are mostly tumor cells.

UMAPs

This section visualizes and compares SingleR annotations to consensus cell type annotations using UMAPs. Note that the displayed UMAP is from a merged object that has not been batch-corrected.

Cell type labels have been harmonized between sources wherever possible. Note that each set of labels has its own “stromal” category which the labels distinguish. In addition, cells labeled unknown_consensus are those with no assigned consensus label, and cells labeled unknown_singler are those where SingleR could not confidently assign a label.

Complete UMAP

First, we display the consensus and SingleR annotations for all cells.

ggplot(celltype_recoded_df) +
  aes(x = UMAP1, y = UMAP2, color = label_recoded) +
  geom_point(size = 0.25, alpha = 0.5) +
  scale_color_manual(
    values = recoded_celltype_pal,
    name = "Harmonized cell types"
  ) +
  facet_wrap(vars(annotation_type)) +
  coord_equal() +
  theme(
    legend.position = "bottom",
    axis.ticks = element_blank(),
    axis.text = element_blank()
  ) +
  guides(color = guide_legend(override.aes = list(size = 2, alpha = 1)))

SingleR annotations only

Below we display a faceted UMAP to highlight the SingleR annotations. Light gray cells in each panel represent all other cell types.

celltype_recoded_df |>
  dplyr::filter(annotation_type == "singler") |>
  faceted_umap(
    annotation_column = label_recoded,
    celltype_colors = recoded_celltype_pal
  )

Faceted comparison to consensus cell types

Below, we display faceted UMAPs highlighting a single cell type but considering only cell types that have direct correspondence between SingleR and consensus cell types. This allows us to see if the normal cells that SingleR is labeling correspond well to the normal cells identified by consensus labels. Each row displays a cell type where the left column shows the consensus version and the right column shows the SingleR version.

In addition, we include a category below putative-tumor to directly compare the unknown consensus labels with the Neuroendocrine cells labeled by SingleR. While these categories are not necessarily directly comparable, they are each most likely to contain tumor cells.

Also, note that the myeloid-labeled SingleR cells are only neutrophils, but the consensus myeloid grouping contains additional cell types.

direct_celltype_matches <- c(
  "B cell",
  "T cell",
  "myeloid",
  "macrophage",
  "monocyte",
  "dendritic cell",
  "natural killer cell",
  "fibroblast",
  "endothelial cell",
  "plasma cell",
  "natural killer cell",
  # we'll use this label to be able to directly compare unknown_consensus to Neuroendocrine
  "putative-tumor"
)


celltype_facet_df <- celltype_recoded_df |>
  dplyr::mutate(
    # recode so Neuroendocrine matches with unknown_consensus in the plot
    label_recoded = ifelse(
      label_recoded %in% c("Neuroendocrine", "unknown_consensus"),
      "putative-tumor",
      label_recoded
    )) |>
  dplyr::filter(label_recoded %in% direct_celltype_matches)

faceted_umap(
  celltype_facet_df,
  annotation_column = label_recoded,
  celltype_colors = recoded_celltype_pal,
  facet_type = "grid",
  annotation_type_column = annotation_type
)

There appears to be a reasonable correspondence between consensus- and SingleR-colored UMAPS for each cell type. We see that SingleR classified more cells compared to consensus, which we expected, and we also see that the additional cells that SingleR labeled are roughly in the same neighborhood as their corresponding consensus labels.

Marker gene expression dotplots

In this section we’ll validate annotations using two sets of marker genes:

# Prepare the expressed_genes vector
# we only care about if that gene is expressed otherwise we won't waste memory and include it
gene_sums <- rowData(merged_sce) |>
  as.data.frame() |>
  dplyr::select(contains("detected")) |>
  as.matrix() |>
  rowSums()
expressed_genes <- names(gene_sums)[gene_sums > 0]

NBAtlas marker genes

This dotplot shows the top-5 highest log2FC marker genes per NBAtlas cell type for validation. Marker genes are taken directly from the NBAtlas paper where they were identified with Seurat::FindMarkers().

This plot does not show the recoded SingleR labels but rather all labels that have associated marker genes; therefore, there are more cell type categories in this plot than in other plots in this report.

# number of marker genes to consider per validation group
n_marker_genes <- 5

top_nbatlas_markers_df <- nbatlas_markers_df |>
  # keep only the top `n_marker_genes`
  dplyr::filter(direction == "up") |>
  dplyr::group_by(NBAtlas_label) |>
  dplyr::mutate(rank_lfc = rank(-avg_log2FC)) |>
  dplyr::ungroup() |>
  dplyr::filter(rank_lfc <= n_marker_genes) |>
  dplyr::select(marker_gene_label = NBAtlas_label, gene_symbol, ensembl_gene_id)
# we want to plot the literal labels here
full_celltype_df <- celltype_df |>
  dplyr::select(
    cell_id, 
    # the dotplot function expects this column name
    label_recoded = pruned.labels
  ) |>
  dplyr::mutate(
    label_recoded = dplyr::case_when(
      # collapse the dendritic cells & monocytes to match what's in the marker genes
      stringr::str_detect(label_recoded, "cDC") ~ "cDC", 
      stringr::str_detect(label_recoded, "monocyte") ~ "Monocyte",
      label_recoded == "NE" ~ "Neuroendocrine",
      .default = label_recoded
  )) |>
  dplyr::filter(label_recoded != "unknown_singler")


# get total number of cells per final annotation group and set up y_label
total_cells_df <- full_celltype_df |>
  dplyr::count(label_recoded, name = "total_cells") |>
  dplyr::arrange(desc(total_cells)) |>
  dplyr::mutate(y_label = glue::glue("{label_recoded} ({total_cells})"))

singler_order <- total_cells_df$y_label
total_cells_df$y_label <- factor(total_cells_df$y_label, levels = singler_order)
nbatlas_bar_order <- total_cells_df$label_recoded
generate_dotplot(
  merged_sce = merged_sce,
  markers_df = top_nbatlas_markers_df,
  singler_df = full_celltype_df,
  total_cells_df = total_cells_df,
  expressed_genes = expressed_genes,
  bar_order = nbatlas_bar_order, 
  celltype_palette = nbatlas_celltype_pal, 
  min_cells = 0
)
`summarise()` has grouped output by 'label_recoded', 'marker_gene_label'. You can override using the `.groups` argument.

For validation, we expect to see high marker gene expression along the diagonal of the dotplot where cell type labels match with their marker genes. We do broadly see high expression along the diagonal, which means our labeled cells have some amount of similar signal to those in NBAtlas. Some sets of marker genes show high expression across multiple cell type categories in particular for closely related cell types (e.g. monocyte, macrophage, myeloid, dendritic, or T-cells and NK-cells), but there do not appear to be any strongly unexpected gene expression patterns.

Consensus validation marker genes

This dotplot shows expression of consensus validation marker genes across matching cell types labeled with SingleR.

# Prepare data frame with singler labels to plot
singler_recoded_df <- celltype_recoded_wide_df |>
  # we don't consider the NAs here
  dplyr::filter(singler != "unknown_singler") |>
  dplyr::select(cell_id, label_recoded = singler)


# get total number of cells per final annotation group and set up y_label
total_cells_df <- singler_recoded_df |>
  dplyr::count(label_recoded, name = "total_cells") |>
  dplyr::arrange(desc(total_cells)) |>
  dplyr::mutate(y_label = glue::glue("{label_recoded} ({total_cells})"))

singler_order <- total_cells_df$y_label
total_cells_df$y_label <- factor(total_cells_df$y_label, levels = singler_order)
# prepare markers for dotplot
dotplot_consensus_markers_df <- consensus_markers_df |>
  # consider only matching labels
  dplyr::filter(validation_group_annotation %in% singler_recoded_df$label_recoded) |>
  dplyr::select(
    marker_gene_label = validation_group_annotation,
    ensembl_gene_id,
    gene_symbol
  )

# get the bar order
consensus_bar_order <- total_cells_df |>
  dplyr::select(label_recoded, y_label) |>
  # inner!!!!
  dplyr::inner_join(
    dotplot_consensus_markers_df, 
    by = c("label_recoded" = "marker_gene_label")
  ) |>
  dplyr::arrange(y_label) |>
  dplyr::pull(label_recoded) |>
  unique()
generate_dotplot(
  merged_sce = merged_sce,
  markers_df = dotplot_consensus_markers_df,
  singler_df = singler_recoded_df,
  total_cells_df = total_cells_df,
  expressed_genes = expressed_genes,
  bar_order = consensus_bar_order, 
  celltype_palette = recoded_celltype_pal, 
  min_cells = 0
)
`summarise()` has grouped output by 'label_recoded', 'marker_gene_label'. You can override using the `.groups` argument.

Again, marker gene expression tends to be high in the matching cell type categories, with the possible exception of B cell which has somewhat lower gene expression of fewer genes; but, there is still some expected expression there. We also see some “promiscuity” where marker genes for closely-related cell types show high expression.

Compare expression when SingleR and consensus annotations match

Next, we look at expression of consensus validation marker genes for each normal/harmonized cell type label. We compare expression across groups of cells:

Each panel contains only cells that SingleR annotated with the given label, and shows only marker genes associated with that given cell type. We expect that cells whose consensus and SingleR annotations agree may show higher gene expression for the given marker.

# this time we are not considering putative tumor
direct_celltype_matches <- c(
  "B cell",
  "T cell",
  # note that we need to collapse the consensus to compare
  "myeloid",
  "fibroblast",
  "endothelial cell",
  "plasma cell",
  "natural killer cell", 
  "unknown_consensus" # we do want to keep these for this analysis
)

# Define the different myeloid types in consensus annotations
consensus_myeloid <- c("myeloid", "dendritic cell", "monocyte", "macrophage")


# Data frame with column `compare_annotations` that indicates
# if consensus and singler (roughly) match
compare_annotations_df <- celltype_recoded_df |>
  # recode labels for comparison
  dplyr::mutate(
    label_recoded = ifelse(
      label_recoded %in% consensus_myeloid,
      "myeloid",
      label_recoded
  )) |>
  dplyr::filter(label_recoded %in% direct_celltype_matches) |>
  dplyr::select(cell_id, annotation_type, label_recoded) |>
  # make it wider to label the situation
  tidyr::pivot_wider(
    names_from = "annotation_type",
    values_from = "label_recoded"
  ) |>
  # remove NAs:
  # - consensus NAs are those with other cell types not to include here
  # - singler NAs were unclassified cells
  tidyr::drop_na() |>
  dplyr::mutate(
    compare_annotations = dplyr::case_when(
      consensus_validation == "unknown_consensus" ~ "consensus not classified", 
      consensus_validation == singler ~ "matching annotations",
      consensus_validation != singler ~ "different annotations"
    ), 
    compare_annotations = forcats::fct_relevel(compare_annotations, "matching annotations", "different annotations", "consensus not classified")
  ) |>
  # we can remove the consensus annotation now
  dplyr::select(cell_id, singler, compare_annotations)


direct_markers_df <- consensus_markers_df |>
  dplyr::filter(validation_group_annotation %in% direct_celltype_matches) |>
  dplyr::select(marker_gene_label = validation_group_annotation, ensembl_gene_id, gene_symbol)
cells <- unique(compare_annotations_df$cell_id)
genes <- unique(direct_markers_df$ensembl_gene_id)
compare_annotations_expr_df <- scuttle::makePerCellDF(
  merged_sce[genes, cells],
  features = genes,
  assay.type = "logcounts",
  use.coldata = "cell_id",
  use.dimred = FALSE
) |>
  tidyr::pivot_longer(starts_with("ENSG"), names_to = "ensembl_gene_id", values_to = "logcounts") |>
  # filter to only cells with expression
  dplyr::filter(logcounts > 0) |>
  # join other data frames
  dplyr::left_join(compare_annotations_df, by = "cell_id") |>
  dplyr::left_join(direct_markers_df, by = "ensembl_gene_id", relationship = "many-to-many")
unique(compare_annotations_expr_df$singler) |>
  purrr::map(
    \(celltype) {
      
      plot_df <- compare_annotations_expr_df |>
        dplyr::filter(singler == celltype, marker_gene_label == celltype) 
      
      ggplot(plot_df) + 
        aes(x = gene_symbol, y = logcounts, fill = compare_annotations) +
        geom_boxplot(outlier.size = 0.25) + 
        ggtitle(glue::glue("cells annotated as: {celltype}"))
        
  }) |>
  patchwork::wrap_plots(ncol = 1) + patchwork::plot_layout(guides = "collect")

In most cases, gene expression for cells with matching consensus/SingleR annotations is higher or about the same as is gene expression for cells with different annotations.

Session Info

# record the versions of the packages used in this analysis and other environment information
sessionInfo()
R version 4.4.0 (2024-04-24)
Platform: aarch64-apple-darwin20
Running under: macOS 15.5

Matrix products: default
BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: America/New_York
tzcode source: internal

attached base packages:
[1] stats4    stats     graphics  grDevices datasets  utils     methods   base     

other attached packages:
 [1] SingleCellExperiment_1.26.0 SummarizedExperiment_1.34.0 Biobase_2.64.0              GenomicRanges_1.56.2        GenomeInfoDb_1.40.1        
 [6] IRanges_2.38.1              S4Vectors_0.42.1            BiocGenerics_0.50.0         MatrixGenerics_1.16.0       matrixStats_1.5.0          
[11] patchwork_1.3.0             ggplot2_3.5.2              

loaded via a namespace (and not attached):
 [1] tidyselect_1.2.1          viridisLite_0.4.2         dplyr_1.1.4               farver_2.1.2              bitops_1.0-9              digest_0.6.37            
 [7] lifecycle_1.0.4           cluster_2.1.8             Cairo_1.6-2               magrittr_2.0.3            compiler_4.4.0            rlang_1.1.6              
[13] tools_4.4.0               yaml_2.3.10               knitr_1.50                labeling_0.4.3            S4Arrays_1.4.1            bit_4.6.0                
[19] curl_6.3.0                DelayedArray_0.30.1       plyr_1.8.9                RColorBrewer_1.1-3        abind_1.4-8               BiocParallel_1.38.0      
[25] withr_3.0.2               purrr_1.0.4               grid_4.4.0                beachmat_2.20.0           colorspace_2.1-1          scales_1.4.0             
[31] iterators_1.0.14          cli_3.6.5                 crayon_1.5.3              generics_0.1.4            httr_1.4.7                tzdb_0.5.0               
[37] rjson_0.2.23              DelayedMatrixStats_1.26.0 scuttle_1.14.0            stringr_1.5.1             zlibbioc_1.50.0           parallel_4.4.0           
[43] BiocManager_1.30.26       XVector_0.44.0            vctrs_0.6.5               Matrix_1.7-1              jsonlite_2.0.0            GetoptLong_1.0.5         
[49] hms_1.1.3                 bit64_4.6.0-1             clue_0.3-66               jpeg_0.1-11               foreach_1.5.2             tidyr_1.3.1              
[55] glue_1.8.0                codetools_0.2-20          stringi_1.8.7             shape_1.4.6.1             gtable_0.3.6              UCSC.utils_1.0.0         
[61] ComplexHeatmap_2.20.0     tibble_3.3.0              pillar_1.10.2             GenomeInfoDbData_1.2.12   circlize_0.4.16           R6_2.6.1                 
[67] sparseMatrixStats_1.16.0  doParallel_1.0.17         rprojroot_2.0.4           vroom_1.6.5               evaluate_1.0.3            lattice_0.22-6           
[73] readr_2.1.5               png_0.1-8                 renv_1.1.3                ggmap_4.0.1               Rcpp_1.0.14               SparseArray_1.4.8        
[79] xfun_0.52                 forcats_1.0.0             pkgconfig_2.0.3           GlobalOptions_0.1.2      
---
title: "Explore the SingleR results"
author: "Stephanie J. Spielman"
date: "`r Sys.Date()`"
output:
  html_notebook:
    toc: true
    toc_depth: 3
    code_folding: hide
---

This notebooks explores the results from running cell type annotation with `SingleR` using the NBAtlas.
The NBAtlas reference was aggregated with `SingelR` prior to model training.

In this notebook, we visualize inferred cell type annotations directly, compare them to normal consensus cell types, and validate cell type assignments with marker genes.


## Setup

```{r, warning = FALSE}
options(readr.show_col_types = FALSE)
suppressPackageStartupMessages({
  library(ggplot2)
  library(patchwork)
  library(SingleCellExperiment)
})

theme_set(theme_bw())

# Define color ramp for shared use in the heatmaps
heatmap_col_fun <- circlize::colorRamp2(c(0, 1), colors = c("white", "darkslateblue"))
# Set heatmap padding option
ComplexHeatmap::ht_opt(TITLE_PADDING = grid::unit(0.6, "in"))
```


### Paths

```{r base paths}
# The base path for the OpenScPCA repository, found by its (hidden) .git directory
repository_base <- rprojroot::find_root(rprojroot::is_git_root)

module_dir <- file.path(repository_base, "analyses", "cell-type-neuroblastoma-04")
ref_dir <- file.path(module_dir, "references")
results_dir <- file.path(module_dir, "results", "singler")
data_dir <- file.path(repository_base, "data", "current", "SCPCP000004")
```


```{r file paths}
# SingleR files
singler_files <- list.files(
  path = results_dir,
  pattern = "_singler-annotations\\.tsv",
  recursive = TRUE,
  full.names = TRUE
) |>
  # add names as library id
  purrr::set_names(
    \(x) {
      stringr::str_remove(basename(x), "_singler-annotations.tsv")
    }
  )

# merged SCE file
# OBTAINED FROM PORTAL DIRECTLY on 2025-07-09
sce_file <- file.path(
  data_dir,
  "SCPCP000004_merged.rds"
)

# broad consensus cell type groups
validation_url <- "https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/refs/heads/main/analyses/cell-type-consensus/references/consensus-validation-groups.tsv"

# palette files
recoded_palette_file <- file.path(
  module_dir,
  "palettes",
  "harmonized-cell-type-palette.tsv"
)

nbatlas_palette_file <- file.path(
  module_dir,
  "palettes",
  "nbatlas-marker-genes-palette.tsv"
)

# marker genes for validation
consensus_markers_file <- "https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/refs/heads/main/analyses/cell-type-consensus/references/validation-markers.tsv"
nbatlas_markers_file <- file.path(ref_dir, "nbatlas-marker-genes.tsv")
```

### Functions

```{r}
# Source Jaccard and heatmap utilities functions
source(file.path(module_dir, "scripts", "utils", "jaccard-utils.R"))

# Source additional utilities functions:
# - harmonize_celltypes()
# - faceted_umap()
# - generate_dotplot()
source(file.path(module_dir, "scripts", "utils", "celltype-utils.R"))
```



### Prepare input data

Read SCE object to get consensus cell types and UMAP coordinates:

```{r}
merged_sce <- readRDS(sce_file)
# immediately remove assays we don't need for space
assay(merged_sce, "spliced") <- NULL
assay(merged_sce, "counts") <- NULL

# RUH ROH!!!
# https://github.com/AlexsLemonade/scpcaTools/issues/298
merged_sce$cell_id <- colnames(merged_sce)
```

Read cell type data frames:

```{r}
singler_df <- singler_files |>
  purrr::map(readr::read_tsv) |>
  purrr::list_rbind(names_to = "library_id") |>
  dplyr::select(-delta.next, -labels) |>
  dplyr::mutate(
    # add cell id column for unique rows & joining
    cell_id = glue::glue("{library_id}-{barcodes}"),
    # recode NA -> "unknown_singler"
    pruned.labels = ifelse(
      is.na(pruned.labels),
      "unknown_singler",
      pruned.labels
    )
  )

# validation data frames
validation_df <- readr::read_tsv(validation_url) |>
  dplyr::select(consensus_annotation, validation_group_annotation)
consensus_markers_df <- readr::read_tsv(consensus_markers_file)
nbatlas_markers_df <- readr::read_tsv(nbatlas_markers_file)


# set up palettes

# recoded to shared colors
recoded_palette_df <- readr::read_tsv(recoded_palette_file)
recoded_celltype_pal <- recoded_palette_df$hex_color
names(recoded_celltype_pal) <- recoded_palette_df$harmonized_label

# only the NBAtlas labels - use for validation dotplot
nbatlas_palette_df <- readr::read_tsv(nbatlas_palette_file)
nbatlas_celltype_pal <- nbatlas_palette_df$hex_color
names(nbatlas_celltype_pal) <- nbatlas_palette_df$NBAtlas_label

```

Join and prepare data for use:

```{r}
celltype_df <- scuttle::makePerCellDF(
  merged_sce,
  use.coldata = c("barcodes", "sample_id", "library_id", "consensus_celltype_annotation"),
  use.dimred = c("UMAP")
) |>
  # there are repeated barcodes so we need to keep cell_id around
  tibble::rownames_to_column("cell_id") |>
  dplyr::rename(
    UMAP1 = UMAP.1,
    UMAP2 = UMAP.2,
    consensus_annotation = consensus_celltype_annotation
  ) |>
  dplyr::left_join(validation_df, by = "consensus_annotation") |>
  # recode NAs to "unknown_consensus" and remove full consensus labels
  dplyr::mutate(validation_group_annotation = ifelse(
    is.na(validation_group_annotation),
    "unknown_consensus",
    validation_group_annotation
  )) |>
  dplyr::select(-consensus_annotation) |>
  dplyr::left_join(singler_df, by = c("cell_id", "barcodes", "library_id"))

# Recode NBAtlas cell types where possible so they match with colors
celltype_recoded_df <- celltype_df |>
  # rename to make annotation sources more clear
  dplyr::rename(
    "consensus_validation" = validation_group_annotation,
    "singler" = pruned.labels
  ) |>
  # pivot longer for wrangling
  tidyr::pivot_longer(
    c(consensus_validation, singler),
    names_to = "annotation_type",
    values_to = "label"
  ) |> 
  harmonize_celltypes(label, label_recoded) |>
  dplyr::select(-label)
```


## Cell types in NBAtlas

Throughout this notebook, we compare the cell type labels from `SingleR` with the consensus cell type labels.
To facilitate this, the resulting `SingleR` labels from `NBAtlas` were grouped together to both a) mirror (where possible) the broad consensus labels, and b) to make some visualizations more manageable.

The table below shows the `NBAtlas` cell types and how they are represented in visualizations, unless otherwise stated.

| Broad label shown in figures | Associated NBAtlas labels                                       |
| ---------------------------- | --------------------------------------------------------------- |
| `T cell`                     | `CD4+ T cell`, `CD8+ T cell`, `Treg`, `NKT cell`                |
| `dendritic cell`             | `cDC1`, `cDC2/DC3`, `Migratory cDC`                             |
| `natural killer cell cell`   | `Circulating NK cell`, `TOX2+/KIT+ NK cell`, `Resident NK cell` |
| `monocyte`                   | `CD4+ T cell`, `Classical monocyte`, `Patrolling monocyte`      |
| `myeloid`                    | `Neutrophil`                                                    |
| `Neuroendocrine`             | `Neuroendocrine`                                                |
| `fibroblast`                 | `Fibroblast`                                                    |
| `macrophage`                 | `Macrophage`                                                    |
| `B cell`                     | `B cell`                                                        |
| `plasma cell`                | `Plasma`                                                        |
| `endothelial`                | `Endothelial`                                                   |
| `Stromal other (NBAtlas)`    | `Stromal other`                                                 |
| `Schwann`                    | `Schwann`                                                       |
| `RBCs`                       | `RBCs`                                                          |
| `Immune cycling`             | `Immune cycling`                                                |
| `pDC`                        | `pDC`                                                           |

Cells labeled `singler_unknown` are those for which `SingleR` could not reliably assign an annotation.

## Heatmap

This section compares SingleR annotations to consensus cell type annotations using a heatmap.
The heatmap is colored by Jaccard similarity index.

```{r, fig.height = 10, fig.width = 10} 
# Pivot wider for heatmap functions
celltype_recoded_wide_df <- celltype_recoded_df |>
  tidyr::pivot_wider(
    names_from = annotation_type,
    values_from = label_recoded
  )

heatmap_col_fun <- circlize::colorRamp2(c(0, 1), colors = c("white", "darkslateblue"))

make_jaccard_heatmap(
  celltype_recoded_wide_df,
  "consensus_validation",
  "singler",
  "Consensus validation label",
  "SingleR NBAtlas label"
)
```

We broadly see high compatibility between SingleR and consensus labels.
In addition, SingleR mostly classifies the unknown consensus cells as Neuroendocrine, which is consistent with our expectation that these are mostly tumor cells.

## UMAPs

This section visualizes and compares SingleR annotations to consensus cell type annotations using UMAPs.
Note that the displayed UMAP is from a merged object that has _not been batch-corrected._

Cell type labels have been harmonized between sources wherever possible.
Note that each set of labels has its own "stromal" category which the labels distinguish.
In addition, cells labeled `unknown_consensus` are those with no assigned consensus label, and cells labeled `unknown_singler` are those where SingleR could not confidently assign a label.

### Complete UMAP

First, we display the consensus and SingleR annotations for all cells.

```{r, fig.width = 14, fig.height = 7}
ggplot(celltype_recoded_df) +
  aes(x = UMAP1, y = UMAP2, color = label_recoded) +
  geom_point(size = 0.25, alpha = 0.5) +
  scale_color_manual(
    values = recoded_celltype_pal,
    name = "Harmonized cell types"
  ) +
  facet_wrap(vars(annotation_type)) +
  coord_equal() +
  theme(
    legend.position = "bottom",
    axis.ticks = element_blank(),
    axis.text = element_blank()
  ) +
  guides(color = guide_legend(override.aes = list(size = 2, alpha = 1)))
```

### SingleR annotations only

Below we display a faceted UMAP to highlight the SingleR annotations.
Light gray cells in each panel represent all other cell types.

```{r fig.width = 16, fig.height = 16}
celltype_recoded_df |>
  dplyr::filter(annotation_type == "singler") |>
  faceted_umap(
    annotation_column = label_recoded,
    celltype_colors = recoded_celltype_pal
  )
```


### Faceted comparison to consensus cell types

Below, we display faceted UMAPs highlighting a single cell type but considering only cell types that have direct correspondence between SingleR and consensus cell types.
This allows us to see if the normal cells that SingleR is labeling correspond well to the normal cells identified by consensus labels.
Each row displays a cell type where the left column shows the consensus version and the right column shows the SingleR version.


In addition, we include a category below `putative-tumor` to directly compare the unknown consensus labels with the `Neuroendocrine` cells labeled by SingleR. 
While these categories are not necessarily directly comparable, they are each most likely to contain tumor cells.

Also, note that the `myeloid`-labeled SingleR cells are _only_ neutrophils, but the consensus myeloid grouping contains additional cell types.

```{r, fig.width=7, fig.height=28}
direct_celltype_matches <- c(
  "B cell",
  "T cell",
  "myeloid",
  "macrophage",
  "monocyte",
  "dendritic cell",
  "natural killer cell",
  "fibroblast",
  "endothelial cell",
  "plasma cell",
  "natural killer cell",
  # we'll use this label to be able to directly compare unknown_consensus to Neuroendocrine
  "putative-tumor"
)


celltype_facet_df <- celltype_recoded_df |>
  dplyr::mutate(
    # recode so Neuroendocrine matches with unknown_consensus in the plot
    label_recoded = ifelse(
      label_recoded %in% c("Neuroendocrine", "unknown_consensus"),
      "putative-tumor",
      label_recoded
    )) |>
  dplyr::filter(label_recoded %in% direct_celltype_matches)

faceted_umap(
  celltype_facet_df,
  annotation_column = label_recoded,
  celltype_colors = recoded_celltype_pal,
  facet_type = "grid",
  annotation_type_column = annotation_type
)
```

There appears to be a reasonable correspondence between consensus- and SingleR-colored UMAPS for each cell type.
We see that SingleR classified more cells compared to consensus, which we expected, and we also see that the additional cells that SingleR labeled are roughly in the same neighborhood as their corresponding consensus labels.


## Marker gene expression dotplots

In this section we'll validate annotations using two sets of marker genes:

* Marker genes identified in NBAtlas corresponding to the atlas cell types
  * This will tell us if we are picking up comparable signal in our data that plays well with NBAtlas
* Consensus cell type marker genes
  * This will provide an independent assessment of the reliability of SingleR normal cell type assignments 

```{r}
# Prepare the expressed_genes vector
# we only care about if that gene is expressed otherwise we won't waste memory and include it
gene_sums <- rowData(merged_sce) |>
  as.data.frame() |>
  dplyr::select(contains("detected")) |>
  as.matrix() |>
  rowSums()
expressed_genes <- names(gene_sums)[gene_sums > 0]
```



### NBAtlas marker genes

This dotplot shows the top-5 highest log2FC marker genes per NBAtlas cell type for validation.
Marker genes are taken directly from the NBAtlas paper where they were identified with `Seurat::FindMarkers()`. 

This plot does not show the recoded SingleR labels but rather all labels that have associated marker genes; therefore, there are more cell type categories in this plot than in other plots in this report.

```{r}
# number of marker genes to consider per validation group
n_marker_genes <- 5

top_nbatlas_markers_df <- nbatlas_markers_df |>
  # keep only the top `n_marker_genes`
  dplyr::filter(direction == "up") |>
  dplyr::group_by(NBAtlas_label) |>
  dplyr::mutate(rank_lfc = rank(-avg_log2FC)) |>
  dplyr::ungroup() |>
  dplyr::filter(rank_lfc <= n_marker_genes) |>
  dplyr::select(marker_gene_label = NBAtlas_label, gene_symbol, ensembl_gene_id)
```


```{r, fig.width = 24, fig.height = 12}
# we want to plot the literal labels here
full_celltype_df <- celltype_df |>
  dplyr::select(
    cell_id, 
    # the dotplot function expects this column name
    label_recoded = pruned.labels
  ) |>
  dplyr::mutate(
    label_recoded = dplyr::case_when(
      # collapse the dendritic cells & monocytes to match what's in the marker genes
      stringr::str_detect(label_recoded, "cDC") ~ "cDC", 
      stringr::str_detect(label_recoded, "monocyte") ~ "Monocyte",
      label_recoded == "NE" ~ "Neuroendocrine",
      .default = label_recoded
  )) |>
  dplyr::filter(label_recoded != "unknown_singler")


# get total number of cells per final annotation group and set up y_label
total_cells_df <- full_celltype_df |>
  dplyr::count(label_recoded, name = "total_cells") |>
  dplyr::arrange(desc(total_cells)) |>
  dplyr::mutate(y_label = glue::glue("{label_recoded} ({total_cells})"))

singler_order <- total_cells_df$y_label
total_cells_df$y_label <- factor(total_cells_df$y_label, levels = singler_order)
nbatlas_bar_order <- total_cells_df$label_recoded
```


```{r, fig.width = 32, fig.height = 16}
generate_dotplot(
  merged_sce = merged_sce,
  markers_df = top_nbatlas_markers_df,
  singler_df = full_celltype_df,
  total_cells_df = total_cells_df,
  expressed_genes = expressed_genes,
  bar_order = nbatlas_bar_order, 
  celltype_palette = nbatlas_celltype_pal, 
  min_cells = 0
)
```
For validation, we expect to see high marker gene expression along the diagonal of the dotplot where cell type labels match with their marker genes.
We do broadly see high expression along the diagonal, which means our labeled cells have some amount of similar signal to those in NBAtlas.
Some sets of marker genes show high expression across multiple cell type categories in particular for closely related cell types (e.g. monocyte, macrophage, myeloid, dendritic, or T-cells and NK-cells), but there do not appear to be any strongly unexpected gene expression patterns.



### Consensus validation marker genes

This dotplot shows expression of consensus validation marker genes across matching cell types labeled with SingleR.

```{r}
# Prepare data frame with singler labels to plot
singler_recoded_df <- celltype_recoded_wide_df |>
  # we don't consider the NAs here
  dplyr::filter(singler != "unknown_singler") |>
  dplyr::select(cell_id, label_recoded = singler)


# get total number of cells per final annotation group and set up y_label
total_cells_df <- singler_recoded_df |>
  dplyr::count(label_recoded, name = "total_cells") |>
  dplyr::arrange(desc(total_cells)) |>
  dplyr::mutate(y_label = glue::glue("{label_recoded} ({total_cells})"))

singler_order <- total_cells_df$y_label
total_cells_df$y_label <- factor(total_cells_df$y_label, levels = singler_order)
```



```{r}
# prepare markers for dotplot
dotplot_consensus_markers_df <- consensus_markers_df |>
  # consider only matching labels
  dplyr::filter(validation_group_annotation %in% singler_recoded_df$label_recoded) |>
  dplyr::select(
    marker_gene_label = validation_group_annotation,
    ensembl_gene_id,
    gene_symbol
  )

# get the bar order
consensus_bar_order <- total_cells_df |>
  dplyr::select(label_recoded, y_label) |>
  # inner!!!!
  dplyr::inner_join(
    dotplot_consensus_markers_df, 
    by = c("label_recoded" = "marker_gene_label")
  ) |>
  dplyr::arrange(y_label) |>
  dplyr::pull(label_recoded) |>
  unique()
```


```{r, fig.width = 26, fig.height = 12}
generate_dotplot(
  merged_sce = merged_sce,
  markers_df = dotplot_consensus_markers_df,
  singler_df = singler_recoded_df,
  total_cells_df = total_cells_df,
  expressed_genes = expressed_genes,
  bar_order = consensus_bar_order, 
  celltype_palette = recoded_celltype_pal, 
  min_cells = 0
)
```

Again, marker gene expression tends to be high in the matching cell type categories, with the possible exception of `B cell` which has somewhat lower gene expression of fewer genes; but, there is still some expected expression there.
We also see some "promiscuity" where marker genes for closely-related cell types show high expression. 


## Compare expression when SingleR and consensus annotations match

Next, we look at expression of consensus validation marker genes for each normal/harmonized cell type label.
We compare expression across groups of cells:

* `matching annotations`: Cells whose consensus annotation matches the SingleR annotation
* `different annotations`: Cells whose consensus annotation differs from the SingleR annotation
* `consensus not classified`: Cells where the consensus annotation is unknown

Each panel contains only cells that SingleR annotated with the given label, and shows only marker genes associated with that given cell type. 
We expect that cells whose consensus and SingleR annotations agree may show higher gene expression for the given marker.

```{r}
# this time we are not considering putative tumor
direct_celltype_matches <- c(
  "B cell",
  "T cell",
  # note that we need to collapse the consensus to compare
  "myeloid",
  "fibroblast",
  "endothelial cell",
  "plasma cell",
  "natural killer cell", 
  "unknown_consensus" # we do want to keep these for this analysis
)

# Define the different myeloid types in consensus annotations
consensus_myeloid <- c("myeloid", "dendritic cell", "monocyte", "macrophage")


# Data frame with column `compare_annotations` that indicates
# if consensus and singler (roughly) match
compare_annotations_df <- celltype_recoded_df |>
  # recode labels for comparison
  dplyr::mutate(
    label_recoded = ifelse(
      label_recoded %in% consensus_myeloid,
      "myeloid",
      label_recoded
  )) |>
  dplyr::filter(label_recoded %in% direct_celltype_matches) |>
  dplyr::select(cell_id, annotation_type, label_recoded) |>
  # make it wider to label the situation
  tidyr::pivot_wider(
    names_from = "annotation_type",
    values_from = "label_recoded"
  ) |>
  # remove NAs:
  # - consensus NAs are those with other cell types not to include here
  # - singler NAs were unclassified cells
  tidyr::drop_na() |>
  dplyr::mutate(
    compare_annotations = dplyr::case_when(
      consensus_validation == "unknown_consensus" ~ "consensus not classified", 
      consensus_validation == singler ~ "matching annotations",
      consensus_validation != singler ~ "different annotations"
    ), 
    compare_annotations = forcats::fct_relevel(compare_annotations, "matching annotations", "different annotations", "consensus not classified")
  ) |>
  # we can remove the consensus annotation now
  dplyr::select(cell_id, singler, compare_annotations)


direct_markers_df <- consensus_markers_df |>
  dplyr::filter(validation_group_annotation %in% direct_celltype_matches) |>
  dplyr::select(marker_gene_label = validation_group_annotation, ensembl_gene_id, gene_symbol)
cells <- unique(compare_annotations_df$cell_id)
genes <- unique(direct_markers_df$ensembl_gene_id)
```


```{r}
compare_annotations_expr_df <- scuttle::makePerCellDF(
  merged_sce[genes, cells],
  features = genes,
  assay.type = "logcounts",
  use.coldata = "cell_id",
  use.dimred = FALSE
) |>
  tidyr::pivot_longer(starts_with("ENSG"), names_to = "ensembl_gene_id", values_to = "logcounts") |>
  # filter to only cells with expression
  dplyr::filter(logcounts > 0) |>
  # join other data frames
  dplyr::left_join(compare_annotations_df, by = "cell_id") |>
  dplyr::left_join(direct_markers_df, by = "ensembl_gene_id", relationship = "many-to-many")
```


```{r, fig.width = 10, fig.height = 20}
unique(compare_annotations_expr_df$singler) |>
  purrr::map(
    \(celltype) {
      
      plot_df <- compare_annotations_expr_df |>
        dplyr::filter(singler == celltype, marker_gene_label == celltype) 
      
      ggplot(plot_df) + 
        aes(x = gene_symbol, y = logcounts, fill = compare_annotations) +
        geom_boxplot(outlier.size = 0.25) + 
        ggtitle(glue::glue("cells annotated as: {celltype}"))
        
  }) |>
  patchwork::wrap_plots(ncol = 1) + patchwork::plot_layout(guides = "collect")
```

In most cases, gene expression for cells with matching consensus/SingleR annotations is higher or about the same as is gene expression for cells with different annotations.

## Session Info

```{r session info}
# record the versions of the packages used in this analysis and other environment information
sessionInfo()
```
